The following steps prepare the dataset for the modeling part. The actual feature analysis was done in another kernel.
Loading Data
library(data.table) # database-like tables
library(Matrix) # sparse matrices
library(ggplot2) # general plots
library(mice) # imputation
library(xgboost) # xtreme gradient boosting
library(crossval) # doing CV
library(ICEbox) # partial dependency plots
# Read both Kaggle splits. read.csv is kept for its default name mangling
# (the type lists below refer to "X1stFlrSF", "X3SsnPorch", ...).
train <- as.data.table(read.csv("../input/train.csv"))
test <- as.data.table(read.csv("../input/test.csv"))
# The test split gets a placeholder target so both tables share all columns.
test[, SalePrice := NaN]
# Stack train on top of test and record the origin of every row.
full <- rbind(train, test)
full[, label := rep(c("train", "test"), times = c(nrow(train), nrow(test)))]
Feature Attributes
# types
# Column names grouped by measurement scale.
# Nominal: unordered categories.
nomVars <- c(
  "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood",
  "Condition1", "Condition2", "HouseStyle", "RoofStyle", "RoofMatl",
  "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating",
  "CentralAir", "Electrical", "GarageType", "PavedDrive", "MiscFeature",
  "SaleType", "SaleCondition"
)
# Ordinal: categories that are later mapped to integer codes.
ordVars <- c(
  "MSSubClass", "LotShape", "Utilities", "LandSlope", "BldgType",
  "OverallQual", "OverallCond", "ExterQual", "ExterCond",
  "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
  "HeatingQC", "KitchenQual", "Functional", "FireplaceQu",
  "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MoSold"
)
# Discrete: stored as integers.
discVars <- c(
  "YearBuilt", "YearRemodAdd", "BsmtFullBath", "BsmtHalfBath", "FullBath",
  "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
  "GarageYrBlt", "GarageCars", "YrSold"
)
# Continuous: stored as doubles and log-transformed later.
contVars <- c(
  "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
  "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF", "GrLivArea",
  "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "X3SsnPorch",
  "ScreenPorch", "PoolArea", "MiscVal", "LotFrontage"
)
# Lookup table of feature name -> measurement scale. Every column of `test`
# except the first is listed; anything not in one of the three explicit lists
# is labelled "nominal".
featAttr <- data.table(name = colnames(test)[-1])
featAttr[, type := ifelse(
  name %in% contVars, "continuous",
  ifelse(
    name %in% discVars, "discrete",
    ifelse(name %in% ordVars, "ordinal", "nominal")
  )
)]
# special values
# In these columns NA is recoded to an explicit "feature absent" label.
absentLabel <- c(
  Alley = "noAccess",
  BsmtQual = "noBasement",
  BsmtCond = "noBasement",
  BsmtExposure = "noBasement",
  BsmtFinType1 = "noBasement",
  BsmtFinType2 = "noBasement",
  FireplaceQu = "noFireplace",
  GarageType = "noGarage",
  GarageFinish = "noGarage",
  GarageQual = "noGarage",
  GarageCond = "noGarage",
  PoolQC = "noPool",
  Fence = "noFence",
  MiscFeature = "none"
)
for (col in names(absentLabel)) {
  set(full, i = which(is.na(full[[col]])), j = col, value = absentLabel[[col]])
}
# A garage build year of 2207 is replaced by 2007.
full[GarageYrBlt == 2207, GarageYrBlt := 2007]
# variable types
# Categorical columns (nominal and ordinal) become factors, continuous columns
# doubles and discrete columns integers.
full[, (c(nomVars, ordVars)) := lapply(.SD, as.factor),
     .SDcols = c(nomVars, ordVars)]
full[, (contVars) := lapply(.SD, as.numeric), .SDcols = contVars]
full[, (discVars) := lapply(.SD, as.integer), .SDcols = discVars]
# order levels
# Explicit level order for the ordinal factors. Values that are not listed
# for a column become NA when the factor is rebuilt below.
# NOTE(review): confirm the BldgType spellings against the raw CSV; any value
# spelled differently there is turned into NA by this step.
qual5 <- c("Ex", "Gd", "TA", "Fa", "Po")
bsmtFin <- c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement")
lvlOrd <- list(
  LotShape = c("Reg", "IR1", "IR2", "IR3"),
  Utilities = c("AllPub", "NoSewr", "NoSeWa", "ELO"),
  LandSlope = c("Gtl", "Mod", "Sev"),
  BldgType = c("1Fam", "2FmCon", "Duplx", "TwnhsE", "TwnhsI"),
  OverallQual = 10:1,
  OverallCond = 10:1,
  ExterQual = qual5,
  ExterCond = qual5,
  BsmtQual = c(qual5, "noBasement"),
  BsmtCond = c(qual5, "noBasement"),
  BsmtExposure = c("Gd", "Av", "Mn", "No", "noBasement"),
  BsmtFinType1 = bsmtFin,
  BsmtFinType2 = bsmtFin,
  HeatingQC = qual5,
  KitchenQual = qual5,
  Functional = c("Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"),
  FireplaceQu = c(qual5, "noFireplace"),
  GarageFinish = c("Fin", "RFn", "Unf", "noGarage"),
  GarageQual = c(qual5, "noGarage"),
  GarageCond = c(qual5, "noGarage"),
  PoolQC = c("Ex", "Gd", "TA", "Fa", "noPool"),
  Fence = c("GdPrv", "MnPrv", "GdWo", "MnWw", "noFence"),
  MoSold = 1:12
)
# Rebuild each listed column as a factor with exactly these levels.
for (j in names(lvlOrd)) {
  set(full, j = j, value = factor(full[[j]], levels = lvlOrd[[j]]))
}
# undefined values
# Where the categorical column says the feature is absent, a missing size,
# count or year for that feature is set to 0.
full[BsmtCond == "noBasement" & is.na(BsmtUnfSF), BsmtUnfSF := 0]
full[BsmtCond == "noBasement" & is.na(TotalBsmtSF), TotalBsmtSF := 0]
full[BsmtFinType1 == "noBasement" & is.na(BsmtFinSF1), BsmtFinSF1 := 0]
full[BsmtFinType2 == "noBasement" & is.na(BsmtFinSF2), BsmtFinSF2 := 0]
full[GarageType == "noGarage" & is.na(GarageYrBlt), GarageYrBlt := 0]
full[GarageType == "noGarage" & is.na(GarageCars), GarageCars := 0]
full[GarageType == "noGarage" & is.na(GarageArea), GarageArea := 0]
full[PoolQC == "noPool" & is.na(PoolArea), PoolArea := 0]
# The "feature absent" label for MiscFeature is "none" (set in the
# special-values step); the previous "noFeature" never matched any row.
full[MiscFeature == "none" & is.na(MiscVal), MiscVal := 0]
Missing Values
# single imputation
# Hand-picked rows receive a fixed replacement value.
full[c(949, 1488, 2349), BsmtExposure := "No"]
full[c(2041, 2186, 2525), BsmtCond := "TA"]
full[c(2218, 2219), BsmtQual := "TA"]
# Rows 2127 and 2577 are recoded as having no garage.
full[c(2127, 2577), `:=`(
  GarageYrBlt = 0,
  GarageCars = 0,
  GarageArea = 0,
  GarageType = "noGarage"
)]
# Every remaining NA in these columns is replaced by one constant per column.
# NOTE(review): confirm that "none" for MasVnrType matches the spelling of the
# existing "no veneer" level in the data.
naFill <- list(
  Exterior1st = "VinylSd",
  Exterior2nd = "VinylSd",
  Electrical = "SBrkr",
  KitchenQual = "TA",
  SaleType = "WD",
  Utilities = "AllPub",
  Functional = "Typ",
  MSZoning = "RL",
  MasVnrArea = 202,
  MasVnrType = "none",
  BsmtFullBath = 0,
  BsmtHalfBath = 0
)
for (col in names(naFill)) {
  set(full, i = which(is.na(full[[col]])), j = col, value = naFill[[col]])
}
# transform
# Target on the natural-log scale (predictions are exp()-ed at the end);
# continuous predictors as log10(x + 1).
full[, SalePrice := log(SalePrice)]
full[, (contVars) := lapply(.SD, function(x) log10(x + 1)), .SDcols = contVars]
# multiple imputation
# mice is run on the continuous and discrete columns plus three categorical
# ones; only BldgType and LotFrontage are copied back from the completed data.
vars <- c(contVars, discVars, "BldgType", "MSSubClass", "HouseStyle")
imp <- mice(full[, .SD, .SDcols = vars], seed = 42)
completed <- complete(imp)
set(full, j = "BldgType", value = completed$BldgType)
set(full, j = "LotFrontage", value = completed$LotFrontage)
Important Features
# Names of the features considered important, grouped loosely by topic.
impVars <- c(
  # lot, location and building
  "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "LandContour",
  "Neighborhood", "BldgType", "HouseStyle", "OverallQual", "OverallCond",
  "YearBuilt", "YearRemodAdd",
  # exterior
  "Exterior1st", "Exterior2nd", "MasVnrArea", "ExterQual", "Foundation",
  # basement
  "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinSF1",
  "BsmtUnfSF", "TotalBsmtSF",
  # heating and living area
  "HeatingQC", "CentralAir", "X1stFlrSF", "X2ndFlrSF", "GrLivArea",
  # rooms
  "BsmtFullBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
  "KitchenQual", "TotRmsAbvGrd", "Fireplaces", "FireplaceQu",
  # garage and outside
  "GarageType", "GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea",
  "GarageQual", "GarageCond", "PavedDrive", "WoodDeckSF", "OpenPorchSF"
)
Adjust Features
# binary features
# One logical flag per "feature absent" label. These comparisons use the
# factor labels, so they must run before the ordinal columns are converted
# to integer codes below.
full[, `:=`(
  noAccess = Alley == "noAccess",
  noBasement = BsmtQual == "noBasement" | BsmtCond == "noBasement",
  noFireplace = FireplaceQu == "noFireplace",
  noGarage = GarageType == "noGarage" | GarageCond == "noGarage",
  noPool = PoolQC == "noPool",
  noFence = Fence == "noFence",
  noMisc = MiscFeature == "none"
)]
binVars <- c(
  "noAccess", "noBasement", "noFireplace", "noGarage",
  "noPool", "noFence", "noMisc"
)
# ordinal to integer
# Each ordinal factor is replaced by its integer level code.
full[, (ordVars) := lapply(.SD, as.integer), .SDcols = ordVars]
The dataset is split back into training and test.
# Training rows: every column except the bookkeeping ones; Ids become row names.
train <- full[label == "train",
              setdiff(names(full), c("Id", "label")),
              with = FALSE]
rownames(train) <- full[label == "train"]$Id
# Test rows: additionally drop the placeholder target.
test <- full[label == "test",
             setdiff(names(full), c("Id", "label", "SalePrice")),
             with = FALSE]
rownames(test) <- full[label == "test"]$Id
Categorical variables are still coded as factors. Here, they are split up into binary dummy variables.
# Sparse design matrix without intercept (factors expand to dummy columns)
# and the log-scale target.
X <- sparse.model.matrix(~ . - 1, data = train[, -"SalePrice"])
Y <- train[["SalePrice"]]
To see overfitting during parameter estimation the model must be validated on data which was not used for training. A cross validation will be done during each boosting iteration. Therefore, the whole training set is used for training and validation.
# Bundle predictors and target into xgboost's own data container.
dtrain <- xgb.DMatrix(X, label = Y)
Regression trees will be the base learners of this boosting model. The function to optimize is the objective which has a usual loss function and a function to penalize complexity. In the linear regression setting the loss function is a quadratic loss. (Doc says ‘linear regression’ so I assume it’s a quadratic loss.)
In a MART fashion the algorithm adds one tree after another, fitting each new tree to the pseudo-residuals of the ensemble built so far. Regularization for this process comes in many forms.
Parameter \(\eta\) (the step size) reduces the influence of each newly learned tree. It is closely related to \(L_1\) regularization in linear regression. \(\eta\) is usually set very small (\(<0.1\)). The smaller \(\eta\) is, the slower the model learns, so more iterations are needed as well. The strategy here is to compute enough iterations for a given \(\eta\), watch the validation error, and then choose the number of iterations with the minimum validation error. (I start with 10000 iterations and eta = 0.01.)
The maximum allowed tree depth per model is another parameter which can limit the complexity of the model. A high maximum tree depth (> 6) would represent interactions of many variables in the data. A MARS model that I fitted on the data earlier suggested no interactions. (Additive models performed better than models with interaction terms.) Therefore, small trees or stumps seem to be more appropriate here. (I will look around a maxdepth of 2, 4, 6 first.)
An additional parameter that reduces overfitting is the subsampling rate. This is the proportion of data points which get sampled (without replacement) before fitting each tree. A usual value here is 50%. This slows down the learning process as well, but it also speeds up computation. Here is one initial run:
Unfortunately I have to comment out all training functions for Kaggle
# Parameters of the initial run.
pars <- list(
  eta              = 0.01, # step size
  gamma            = 0,
  max_depth        = 4,
  min_child_weight = 1,
  alpha            = 0,
  subsample        = 0.5,  # row sampling per tree
  colsample_bytree = 0.8   # column sampling per tree
)
# fit <- xgb.cv(params = pars, data = dtrain, nrounds = 10000, nfold = 5)
By default xgb.cv already computes RMSE on the training and validation set in the linear regression setting. Below is a plot of the learning process showing mean RMSE and standard deviation for training and validation (test). 10000 iterations seem sufficient to reach minimum test error when \(\eta = 0.01\), 50% subsampling and 80% column sampling is used.
#dt <- fit$evaluation_log
#df <- data.frame(
# iter = rep(dt$iter, 2),
# set = rep(c("train", "test"), each = nrow(dt)),
# mean = c(dt$train_rmse_mean, dt$test_rmse_mean),
# sd = c(dt$train_rmse_std, dt$test_rmse_std)
#)
#ggplot(df, aes(x = iter, y = mean, group = set)) +
# geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd), alpha = .2) +
# geom_path(aes(color = set)) +
# scale_y_continuous(limits = c(0, .2)) +
# theme_bw()
One can prevent the formation of leaves by setting a minimum child weight. A split is not performed unless both partitions get a minimum number of observations. (As far as I know, in the regression setting the number of observations ending up in a single leaf can be directly translated to child weight.) Appropriate numbers for minimum child weight have to be found. With a maximum allowed tree depth of 4 and around 1600 data points it seems appropriate to look for the optimum child weight in the range of 0 - 100.
There is another parameter \(\gamma\) which prevents trees from growing. It only allows a tree to grow further if the partition achieves a minimum reduction in the loss (here, the squared error).
Then, there are some parameters where I have no idea where to start. One is \(\alpha\), which is an \(L_1\) regularization for weights. The other is colsample_bytree. This is another sampling method which samples a proportion of variables (columns in \(X\)) before building each tree.
Tree Depth
The maximum tree depth and minimum child weight have a huge influence on the performance. First, their best values are found. Therefore, a first grid search in big steps is performed. This is the first grid:
# Coarse grid over tree depth and minimum child weight; everything else fixed.
pars.grid <- expand.grid(
  eta              = 0.01,
  gamma            = 0,
  max_depth        = c(2, 4, 6),
  min_child_weight = c(1, 20, 40, 80),
  alpha            = 0,
  subsample        = 0.5,
  colsample_bytree = 0.8
)
Each parameter combination will be trained and the lowest RMSE and its iteration number for each parameter combination will be returned. An early stopping condition is added, so that the process stops if after 1000 iterations there has not been any decrease in RMSE.
# Cross-validate every parameter combination in `pars` and report its best
# round.
#
# train:  training data, passed to xgb.cv as `data`
# pars:   data.frame with one parameter combination per row
# n:      maximum number of boosting rounds (`nrounds`)
# objFun: optional custom objective, passed to xgb.cv as `obj`
# nstop:  passed to xgb.cv as `early_stopping_rounds`
#
# Returns a data.frame with one row per combination: `iter` is the row of the
# evaluation log with the lowest mean test RMSE, `mean` and `sd` are the mean
# and standard deviation of the test RMSE in that row.
train.grid <- function(train, pars, n = 10000, objFun = NULL, nstop = 1000) {
  nComb <- nrow(pars)
  bestIter <- integer(nComb)
  bestMean <- numeric(nComb)
  bestSd <- numeric(nComb)
  for (k in seq_len(nComb)) {
    # k-th grid row as a named list of scalars
    cv <- xgb.cv(
      params = lapply(pars, `[`, k),
      data = train, nrounds = n, nfold = 5, obj = objFun,
      early_stopping_rounds = nstop
    )
    evalLog <- cv$evaluation_log
    best <- which.min(evalLog$test_rmse_mean)
    bestIter[k] <- best
    bestMean[k] <- evalLog$test_rmse_mean[best]
    bestSd[k] <- evalLog$test_rmse_std[best]
  }
  data.frame(iter = bestIter, mean = bestMean, sd = bestSd)
}
The training is done below. For the Kaggle kernel all these grid searches are commented out. (They all run for a few minutes.) The code below summarizes the results of all parameter combinations as a plot.
# res <- train.grid(dtrain, pars.grid)
# df <- cbind(res, pars.grid)
# df[which.min(df$mean), ]
# pd <- position_dodge(width = 3)
# ggplot(df, aes(x = min_child_weight, color = as.factor(max_depth))) +
# geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), position = pd) +
# geom_point(aes(y = mean), position = pd) +
# theme_bw()
Minimum child weight of 1 and maximum tree depth of 2 seem best, although most error bars are overlapping. In a second search a smaller parameter space around these values is searched.
# Finer grid around the best depth / child-weight region of the first search.
pars.grid <- expand.grid(
  eta              = 0.01,
  gamma            = 0,
  max_depth        = c(1, 2, 3),
  min_child_weight = c(1, 5, 10),
  alpha            = 0,
  subsample        = 0.5,
  colsample_bytree = 0.8
)
# res <- train.grid(dtrain, pars.grid)
# df <- cbind(res, pars.grid)
# df[which.min(df$mean), ]
# pd <- position_dodge(width = .5)
# ggplot(df, aes(x = min_child_weight, color = as.factor(max_depth))) +
# geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), position = pd) +
# geom_point(aes(y = mean), position = pd) +
# theme_bw()
A minimum mean RMSE is reached with maximum tree depth of 3 and minimum child weight of 1. However, all error bars are overlapping now. The changes in performance might not be significant anymore.
Regularization
Next, \(\gamma\) and \(\alpha\) are tested.
# Grid over the two regularization parameters; depth and child weight fixed.
pars.grid <- expand.grid(
  eta              = 0.01,
  gamma            = c(0, 0.1, 0.2, 0.3),
  max_depth        = 3,
  min_child_weight = 1,
  alpha            = c(0, 0.3, 0.7),
  subsample        = 0.5,
  colsample_bytree = 0.8
)
# res <- train.grid(dtrain, pars.grid)
# df <- cbind(res, pars.grid)
# df[which.min(df$mean), ]
# pd <- position_dodge(width = .01)
# ggplot(df, aes(x = gamma, color = as.factor(alpha))) +
# geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), position = pd) +
# geom_point(aes(y = mean), position = pd) +
# theme_bw()
\(\gamma = 0.1\) and \(\alpha = 0\) performed best. The differences do not seem significant though.
Subsampling
Next, row and column subsampling is analyzed.
# Grid over row and column subsampling rates (gamma fixed at 0.1).
pars.grid <- expand.grid(
  eta              = 0.01,
  gamma            = 0.1,
  max_depth        = 3,
  min_child_weight = 1,
  alpha            = 0,
  subsample        = c(0.3, 0.5, 0.8),
  colsample_bytree = c(0.3, 0.5, 0.8)
)
# res <- train.grid(dtrain, pars.grid)
# df <- cbind(res, pars.grid)
# df[which.min(df$mean), ]
# pd <- position_dodge(width = .01)
# ggplot(df, aes(x = subsample, color = as.factor(colsample_bytree))) +
# geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), position = pd) +
# geom_point(aes(y = mean), position = pd) +
# theme_bw()
30% column subsampling by tree with 30% subsampling performed best. Again, the differences seem not significant. The best validation RMSE is now at \(0.122(0.006)\) after around 9900 iterations.
# Tuned parameters for the quadratic-loss model.
pars1 <- list(
  eta              = 0.01,
  gamma            = 0.1,
  max_depth        = 3,
  min_child_weight = 1,
  alpha            = 0,
  subsample        = 0.3,
  colsample_bytree = 0.3
)
Quadratic loss can be sensitive to outliers. Other loss functions, such as the Huber loss, become linear with increasing residual values. xgb.train allows custom loss functions to be provided.
The function below should describe a Huber loss. It takes predictions and the training object and returns first and second order gradients.
# Huber loss as a custom xgboost objective (threshold 1.5).
#
# preds:  current predictions
# dtrain: training object; its "label" field is read with getinfo()
#
# Returns a list with the first-order gradient `grad` and second-order
# gradient `hess` of the loss with respect to the predictions: quadratic
# (grad = residual, hess = 1) while |residual| < 1.5, linear
# (grad = +/-1.5, hess = 0) beyond that.
Huber <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  resid <- preds - labels
  # The threshold applies to the absolute residual. The previous code had the
  # comparison inside abs(), so residuals below -1.5 were never clipped.
  quadratic <- abs(resid) < 1.5
  grad <- ifelse(quadratic, resid, 1.5 * sign(resid))
  hess <- ifelse(quadratic, 1, 0)
  list(grad = grad, hess = hess)
}
Now the same parameter tuning as above is performed again.
Tree Depth
# Huber loss: coarse grid over tree depth and minimum child weight.
pars.grid <- expand.grid(
  eta              = 0.01,
  gamma            = 0,
  max_depth        = c(2, 4, 6),
  min_child_weight = c(1, 20, 40, 80),
  alpha            = 0,
  subsample        = 0.5,
  colsample_bytree = 0.8
)
# res <- train.grid(dtrain, pars.grid, objFun = Huber)
# df <- cbind(res, pars.grid)
# df[which.min(df$mean), ]
# pd <- position_dodge(width = 3)
# ggplot(df, aes(x = min_child_weight, color = as.factor(max_depth))) +
# geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), position = pd) +
# geom_point(aes(y = mean), position = pd) +
# theme_bw()
As before, minimum child weight is the main parameter. Maximum depth of 2 and minimum child weight of 1 wins.
# Huber loss: finer grid over tree depth and minimum child weight.
pars.grid <- expand.grid(
  eta              = 0.01,
  gamma            = 0,
  max_depth        = c(1, 2, 3),
  min_child_weight = c(1, 5, 10),
  alpha            = 0,
  subsample        = 0.5,
  colsample_bytree = 0.8
)
# res <- train.grid(dtrain, pars.grid, objFun = Huber)
Maximum depth of 3 and minimum child weight of 1 wins. Again, the differences are much smaller than the standard deviations.
# df <- cbind(res, pars.grid)
# df[which.min(df$mean), ]
# pd <- position_dodge(width = 1)
# ggplot(df, aes(x = min_child_weight, color = as.factor(max_depth))) +
# geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), position = pd) +
# geom_point(aes(y = mean), position = pd) +
# theme_bw()
Regularization
Next, \(\gamma\) and \(\alpha\) are tested.
# Huber loss: grid over the two regularization parameters.
pars.grid <- expand.grid(
  eta              = 0.01,
  gamma            = c(0, 0.1, 0.2, 0.3),
  max_depth        = 3,
  min_child_weight = 1,
  alpha            = c(0, 0.3, 0.7),
  subsample        = 0.5,
  colsample_bytree = 0.8
)
# res <- train.grid(dtrain, pars.grid, objFun = Huber)
\(\gamma = 0\) and \(\alpha = 0\) wins.
# df <- cbind(res, pars.grid)
# df[which.min(df$mean), ]
# pd <- position_dodge(width = .01)
# ggplot(df, aes(x = gamma, color = as.factor(alpha))) +
# geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), position = pd) +
# geom_point(aes(y = mean), position = pd) +
# theme_bw()
Subsampling
Next, row and column subsampling is tested.
# Huber loss: grid over row and column subsampling rates (gamma fixed at 0).
pars.grid <- expand.grid(
  eta              = 0.01,
  gamma            = 0,
  max_depth        = 3,
  min_child_weight = 1,
  alpha            = 0,
  subsample        = c(0.3, 0.5, 0.8),
  colsample_bytree = c(0.3, 0.5, 0.8)
)
# res <- train.grid(dtrain, pars.grid, objFun = Huber)
50% row and 30% column subsampling wins. Although the differences do not seem significant. The best validation RMSE is \(0.12(0.01)\) after roughly 3760 iterations.
# df <- cbind(res, pars.grid)
# df[which.min(df$mean), ]
# pd <- position_dodge(width = .01)
# ggplot(df, aes(x = subsample, color = as.factor(colsample_bytree))) +
# geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), position = pd) +
# geom_point(aes(y = mean), position = pd) +
# theme_bw()
The final parameters are:
# Tuned parameters for the Huber-loss model.
pars2 <- list(
  eta              = 0.01,
  gamma            = 0,
  max_depth        = 3,
  min_child_weight = 1,
  alpha            = 0,
  subsample        = 0.5,
  colsample_bytree = 0.3
)
Both models – Huber and Quadratic Loss – reached more or less the same mean RMSE. However, the Quadratic Loss models RMSE estimate had a lower standard deviation. Therefore, it will be chosen as the final model.
# Final model: quadratic loss with the tuned parameters `pars1`, trained on
# the full training set for 9900 rounds.
dX <- xgb.DMatrix(data = X, label = Y)
# The argument is spelled out as `nrounds`; the previous `nround` only worked
# through R's partial argument matching.
fit <- xgb.train(params = pars1, data = dX, nrounds = 9900)
Feature Importance
There are a few tools for inspecting the final model. For example, feature importance can be estimated by the weighted abundance of each feature in the model. The 20 most important features are shown below. As in the MARS model that I trained before, GrLivArea and OverallQual are the two most important features by far. The rank of the features below them, however, differs from the MARS model.
# Importance table for the fitted model; plot its first 20 rows.
dt <- xgb.importance(feature_names = colnames(dX), model = fit)
xgb.plot.importance(dt[seq_len(20)])
Tree Structures
There is a tool that represents the tree ensemble as a single merged tree. Since the trees here have a maximum depth of 3, it should still be reasonable to plot it. Nodes in the graph represent leaves and splits. Nodes with only one edge represent leaves.
# Draw the whole ensemble as one merged tree.
xgb.plot.multi.trees(model = fit, feature_names = colnames(dX))
Partial Dependency Plots
The ICEbox package offers functions for partial dependency plots. With partial dependency plots one can visualize the influence of a single predictor variable on the target variable. The target is plotted with the desired predictor. Since the target is also dependent on the other predictors, which are not shown, the expected value of the target over all unseen predictors is shown.
Below are the partial dependency plots for the 3 most important features. Note that the target SalePrice is still log transformed. Only 50% of the data points in \(X\) are used. This speeds up computation.
# Dense design matrix for ICEbox (replaces the sparse X used for training).
X <- model.matrix(~ . - 1, data = train[, -"SalePrice"])
Y <- train[["SalePrice"]]
# One ICE / partial dependency plot per feature, each built on half the rows.
for (feat in c("GrLivArea", "OverallQual", "TotalBsmtSF")) {
  xgb.ice <- ice(fit, X, Y, feat, frac_to_build = 0.5)
  plot(xgb.ice)
}
Finally the test set can be predicted.
# Encode the test rows the same way as the training rows and predict the
# log-scale sale price.
Xtest <- sparse.model.matrix(~ . - 1, data = test)
pred <- predict(fit, newdata = Xtest)
The distribution of predicted sale prices is shown below.
# Submission table: test Ids with predictions taken back from the log scale.
df <- data.frame(
  Id = full[label == "test"]$Id,
  SalePrice = exp(pred)
)
summary(df[["SalePrice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41180 128100 155800 177200 209100 553000
ggplot(df, aes(x = SalePrice)) +
  geom_histogram(bins = 30) +
  theme_bw()
That’s it! Thanks for reading through this whole thing. If you have any suggestions, I’m quite new to XGBoost, comments are welcome.
# Write the submission file: unquoted, without row names.
write.csv(
  df,
  file = "XGB_submission.csv",
  quote = FALSE,
  row.names = FALSE
)